What’s covered in this lecture?

1 Introductory Web Scraping

The URL: http://brandirectory.com/league_tables/table/global-500-2018

1.1 Web Parsing

  • In the Google Chrome web browser, right-click the page and choose "View page source".
  • You need to get familiar with some basic HTML and CSS …
  • Right-click an element and choose "Inspect" to examine the page carefully and identify the data to collect.
library(rvest)

# Scrape the league table at the URL below into a data frame and save it as
# TopBrand2018.csv.  Every <tr> inside a <tbody> is treated as one brand and
# its <td> cells are read by position.
webpage <- read_html("http://brandirectory.com/league_tables/table/global-500-2018")
xdata <- webpage %>% html_nodes("tbody tr")
if (length(xdata) == 0) {
  stop("No 'tbody tr' rows found on the page; the page layout may have changed.",
       call. = FALSE)
}

# Return the first extracted value as text, or NA when the selector matched
# nothing.  Keeping every field at length one keeps all rows the same width.
FirstOrNA <- function(x) {
  if (length(x) > 0) as.character(x[[1]]) else NA_character_
}

# Convert one table row into a named character vector of nine fields.
# Rows with fewer than nine cells are skipped (NULL is dropped by rbind).
ParseBrandRow <- function(row) {
  cells <- row %>% html_nodes("td")
  if (length(cells) < 9) {
    return(NULL)
  }
  c(
    Rank17  = FirstOrNA(cells[1] %>% html_nodes("span") %>% html_text()),
    # as.numeric() turns a non-numeric previous rank into NA.
    Rank16  = FirstOrNA(cells[2] %>% html_text() %>% as.numeric()),
    Company = FirstOrNA(cells[4] %>% html_text()),
    # html_attr() is rvest's own accessor, so no extra package has to be attached.
    Logo    = FirstOrNA(cells[3] %>% html_nodes("img") %>% html_attr("src")),
    Flag    = FirstOrNA(cells[5] %>% html_nodes("img") %>% html_attr("src")),
    Value17 = FirstOrNA(cells[6] %>% html_nodes(".h") %>% html_text()),
    Value16 = FirstOrNA(cells[7] %>% html_nodes(".h") %>% html_text()),
    Rate17  = FirstOrNA(cells[8] %>% html_nodes("span") %>% html_text()),
    Rate16  = FirstOrNA(cells[9] %>% html_nodes("span") %>% html_text())
  )
}

# Parse all rows first and bind them once, instead of growing a matrix with
# rbind() inside a loop.
RowList <- lapply(seq_along(xdata), function(i) ParseBrandRow(xdata[i]))
DataX <- do.call(rbind, RowList)
# xname <- webpage %>% html_nodes(".col-sm-9 .main th") %>% html_text() 
colnames(DataX) <- c("Rank17", "Rank16", "Company", "Logo", "Flag",
                     "Value17", "Value16", "Rate17", "Rate16")
DataX <- as.data.frame(DataX)
knitr::kable(head(DataX), format = "html")
write.csv(DataX, file = "TopBrand2018.csv", row.names = FALSE)

1.2 Data Preprocessing

# Reload the scraped table from disk and summarise it before any cleaning,
# so that type and missing-value problems show up in the output.
DataX <- read.csv(file = "TopBrand2018.csv")
summary(object = DataX)
##      Rank17          Rank16                                  Company   
##  Min.   :  1.0   Min.   :  1.0    CVS Health                     :  1  
##  1st Qu.:125.8   1st Qu.:115.5    Sumitomo Mitsui Financial Group:  1  
##  Median :250.5   Median :231.0   20th Century Fox                :  1  
##  Mean   :250.5   Mean   :234.0   3 Mobile                        :  1  
##  3rd Qu.:375.2   3rd Qu.:346.5   3M                              :  1  
##  Max.   :500.0   Max.   :500.0   7-Eleven                        :  1  
##                  NA's   :41      (Other)                         :494  
##                                                         Logo    
##  /images/profile/logo/2000px_macys_logo_cms.jpg           :  1  
##  /images/profile/logo/2000px_morgan_stanley_logo_1_cms.jpg:  1  
##  /images/profile/logo/2000px_youtube_logo_2017_cms.jpg    :  1  
##  /images/profile/logo/20th_century_fox_logo.jpg           :  1  
##  /images/profile/logo/3_mobile_3.png                      :  1  
##  /images/profile/logo/3m.jpg                              :  1  
##  (Other)                                                  :494  
##                    Flag        Value17          Value16           Rate17     
##  /images/flags/us.png:193   Min.   : 14635   Min.   :     0   Min.   :17.00  
##  /images/flags/cn.png: 60   1st Qu.: 18537   1st Qu.: 16242   1st Qu.:21.00  
##  /images/flags/jp.png: 36   Median : 22246   Median : 21944   Median :22.00  
##  /images/flags/fr.png: 35   Mean   : 32536   Mean   : 28122   Mean   :21.92  
##  /images/flags/gb.png: 29   3rd Qu.: 37502   3rd Qu.: 32032   3rd Qu.:23.00  
##  /images/flags/de.png: 24   Max.   :150811   Max.   :109470   Max.   :24.00  
##  (Other)             :123   NA's   :400      NA's   :400      NA's   :400    
##      Rate16     
##  Min.   : 0.00  
##  1st Qu.:21.00  
##  Median :22.00  
##  Mean   :21.46  
##  3rd Qu.:23.00  
##  Max.   :24.00  
##  NA's   :400
# Coerce each column to its working type: names and image paths become
# character vectors, ranks/values/rates become numeric.
for (TextCol in c("Company", "Logo", "Flag")) {
  DataX[[TextCol]] <- as.character(DataX[[TextCol]])
}
for (NumCol in c("Rank17", "Rank16", "Value17", "Value16", "Rate17", "Rate16")) {
  DataX[[NumCol]] <- as.numeric(DataX[[NumCol]])
}
# Summarise again to see the effect of the conversions.
summary(object = DataX)
##      Rank17          Rank16        Company              Logo               Flag          
##  Min.   :  1.0   Min.   :  1.0   Length:500         Length:500         Length:500        
##  1st Qu.:125.8   1st Qu.:115.5   Class :character   Class :character   Class :character  
##  Median :250.5   Median :231.0   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :250.5   Mean   :234.0                                                           
##  3rd Qu.:375.2   3rd Qu.:346.5                                                           
##  Max.   :500.0   Max.   :500.0                                                           
##                  NA's   :41                                                              
##     Value17          Value16           Rate17          Rate16     
##  Min.   : 14635   Min.   :     0   Min.   :17.00   Min.   : 0.00  
##  1st Qu.: 18537   1st Qu.: 16242   1st Qu.:21.00   1st Qu.:21.00  
##  Median : 22246   Median : 21944   Median :22.00   Median :22.00  
##  Mean   : 32536   Mean   : 28122   Mean   :21.92   Mean   :21.46  
##  3rd Qu.: 37502   3rd Qu.: 32032   3rd Qu.:23.00   3rd Qu.:23.00  
##  Max.   :150811   Max.   :109470   Max.   :24.00   Max.   :24.00  
##  NA's   :400      NA's   :400      NA's   :400     NA's   :400
# Derive a country code from each flag image path, e.g.
# "/images/flags/us.png" -> "us".  Both patterns are anchored and the dot is
# escaped, so only the leading directory and the trailing extension are
# removed (an unescaped ".png" matches any character followed by "png").
DataX$Country <- sub("^/images/flags/", "", DataX$Flag)
DataX$Country <- sub("\\.png$", "", DataX$Country)
DataX$Country <- as.factor(DataX$Country)

# Bar chart of the countries with the most brands.  head() keeps at most ten
# bars and, unlike [1:10], adds no NA bars when fewer countries are present.
CountryCounts <- sort(summary(DataX$Country), decreasing = TRUE)
barplot(head(CountryCounts, 10),
        col = 5, main = "Top 10 Countries with Top Brands")

1.3 Image Downloading

library(curl)

# Download site-relative image paths (e.g. "/images/flags/us.png") and store
# each file at the same relative path under the working directory.
#   paths    - character vector (or factor) of paths starting with "/"
#   base_url - site prefix prepended to every path
# Duplicated, NA and empty paths are skipped.  A failed download raises a
# warning and the loop continues, so one bad file does not abort the rest.
# Returns (invisibly) the paths that were attempted.
DownloadImages <- function(paths, base_url = "http://brandirectory.com") {
  paths <- unique(as.character(paths))
  paths <- paths[!is.na(paths) & nzchar(paths)]
  for (path in paths) {
    destfile <- file.path(".", sub("^/+", "", path))
    # Create whatever directory the path needs instead of a fixed ladder.
    dir.create(dirname(destfile), recursive = TRUE, showWarnings = FALSE)
    tryCatch(
      curl_download(url = paste0(base_url, path), destfile = destfile),
      error = function(e) {
        warning("Failed to download ", path, ": ", conditionMessage(e),
                call. = FALSE)
      }
    )
  }
  invisible(paths)
}

# Brand logos: one file per distinct logo path.
DownloadImages(DataX$Logo)

# Country flags: many brands share a flag, so download each one once.
flaglist <- unique(DataX$Flag)
DownloadImages(flaglist)

2 Top Brand Data Visualization

2.1 Data Preprocessing

Step 1: Start with data loading and preprocessing. For simplicity, we omit missing values.

# Load the multi-year ranking.  stringsAsFactors = TRUE makes the text columns
# factors on every R version (R >= 4.0 reads them as character by default);
# the per-level counts from summary() and the levels() recoding further down
# both require Sector to be a factor.
DataX <- read.csv("BrandFinance.csv", stringsAsFactors = TRUE)
DataX <- DataX[, c("Year", "Rank", "RankLastyear", "Company",
                   "Value", "Rate", "Country", "Sector")]
# A value of -1 in Value or Rate is recoded to NA.
DataX$Value[which(DataX$Value == -1)] <- NA
DataX$Rate[which(DataX$Rate == -1)] <- NA
# Convert via as.character() so a factor yields its labels rather than its
# internal level codes; labels that are not numbers become NA (with a warning).
DataX$RankLastyear <- as.numeric(as.character(DataX$RankLastyear))
# Drop every row that has an NA in any of the kept columns.
DataX <- na.omit(DataX)
summary(DataX)
##       Year           Rank        RankLastyear                Company        Value       
##  Min.   :2009   Min.   :  1.0   Min.   :  1.00   Allianz         :  9   Min.   :  3955  
##  1st Qu.:2011   1st Qu.: 25.0   1st Qu.: 25.00   Amazon.com      :  9   1st Qu.: 12475  
##  Median :2013   Median : 49.5   Median : 50.00   American Express:  9   Median : 16607  
##  Mean   :2013   Mean   : 49.9   Mean   : 55.51   Apple           :  9   Mean   : 20498  
##  3rd Qu.:2015   3rd Qu.: 75.0   3rd Qu.: 77.75   AT&T            :  9   3rd Qu.: 23007  
##  Max.   :2017   Max.   :100.0   Max.   :391.00   Bank of America :  9   Max.   :145918  
##                                                  (Other)         :824                   
##       Rate          Country                   Sector   
##  Min.   : 0.00   us     :403   Banks             :159  
##  1st Qu.:20.00   jp     : 90   Technology        :150  
##  Median :21.00   cn     : 78   Telecommunications:105  
##  Mean   :20.98   de     : 73   Retail            : 90  
##  3rd Qu.:22.00   gb     : 54   Auto Manufacturers: 70  
##  Max.   :24.00   fr     : 51   Oil&Gas           : 54  
##                  (Other):129   (Other)           :250
# Count the rows per sector and keep the five largest counts.
tmp <- summary(DataX$Sector)
tmp <- sort(tmp, decreasing = TRUE)[1:5]
# Bars are drawn without gaps and with both axes suppressed; the sector names
# are added as text rotated by 90 degrees, starting at y = 2.
barplot(height = tmp, col = 5, space = 0, xaxt = "n", yaxt = "n",
        main = "Top 5 sectors with Top 100 Brands")
text(x = seq_along(tmp) - 0.6, y = 2, labels = names(tmp),
     cex = 1.2, pos = 4, srt = 90, xpd = TRUE)

# Names of the five most frequent sectors.
TopSector <- names(tmp)

2.2 Creative Bubble Charts

Step 2: Think creatively about how the top brand rankings/values can be visualized …

# Rename every sector level that is not one of the top sectors to "Others",
# then order the levels as: top sectors first, "Others" last.
levels(DataX$Sector)[!(levels(DataX$Sector) %in% TopSector)] <- "Others"
DataX$Sector <- factor(DataX$Sector, levels = c(TopSector, "Others"))

# One semi-transparent colour per sector level (palette entries 2, 3, ...).
Colmap <- adjustcolor(1 + seq_len(nlevels(DataX$Sector)), alpha.f = 0.6)

# Axis limits shared by all plots, computed over the whole data set.
xlim0 <- c(-10, max(DataX$RankLastyear))
ylim0 <- c(-5, max(DataX$Rank))
# Draw the bubble chart for one year: previous-year rank on the x axis,
# current rank on the (reversed) y axis, one bubble per row, coloured by
# Sector and sized by Value.
#   DataX - data frame with columns Year, Rank, RankLastyear, Value and Sector
#           (Sector must be a factor whose levels line up with Colmap)
#   Year  - the year whose rows are plotted
# Uses the global objects xlim0, ylim0 and Colmap.
# Returns the value of legend(), invisibly, as before.
BubblePlot <- function(DataX, Year) {
  TmpX <- DataX[DataX$Year == Year, ]

  # Bubble sizes are scaled against the value range of the full data set, not
  # just this year's rows, so sizes stay comparable between years.
  ValueSpan <- diff(range(DataX$Value))
  if (isTRUE(ValueSpan == 0)) {
    # All values identical: avoid 0/0 and draw every bubble at the base size.
    Size <- rep(1, nrow(TmpX))
  } else {
    Size <- 1 + 9 * (TmpX$Value - min(DataX$Value)) / ValueSpan
  }

  # Set the margins for this plot only and restore the caller's settings on exit.
  OldPar <- par(mar = c(4, 4, 3, 3))
  on.exit(par(OldPar), add = TRUE)

  # rev(ylim0) flips the y axis so that the lowest rank number is at the top.
  plot(TmpX$RankLastyear, TmpX$Rank,
       xlim = xlim0, ylim = rev(ylim0),
       pch = 20, col = Colmap[TmpX$Sector], cex = Size,
       xlab = paste(Year - 1, "Ranking"), ylab = paste(Year, "Ranking"),
       main = paste("Year", Year))
  legend("topright", levels(DataX$Sector), pch = 20, col = Colmap)
}
# Draw the bubble chart for the year 2017.
BubblePlot(DataX, 2017)

2.3 Let It Animate

Step 3: Generate the animation with year frames

library(magick)

# Render one BubblePlot frame per year into a magick graphics device, then
# write the frames out as an animated GIF at one frame per second.
ListYear <- sort(unique(DataX$Year))
Img <- image_graph(500, 500, res = 72)
# seq_along() draws no frame when ListYear is empty, and the finally clause
# closes the device even if drawing a frame fails.
tryCatch(
  {
    for (k in seq_along(ListYear)) BubblePlot(DataX, ListYear[k])
  },
  finally = invisible(dev.off())
)
Img %>% image_trim() %>% image_animate(fps = 1) %>% image_write("TopBrands.gif")

2.4 Let It Be Interactive

Step 4: You are right, we are talking about Plotly …

library(plotly)

# Interactive bubble chart for a single year, using the same axes as
# BubblePlot(): previous-year rank on x, current rank on a reversed y axis.
Year <- 2017
TmpX <- DataX[DataX$Year == Year, ]
TmpX$Country <- factor(TmpX$Country)
# Marker sizes are scaled against the value range of the full data set.
TmpX$Size <- 1 + 19 * (TmpX$Value - min(DataX$Value)) / diff(range(DataX$Value))
plot_ly(TmpX, x = ~RankLastyear, y = ~Rank, type = "scatter", mode = "markers",
        size = ~Size, color = ~Sector,
        hoverinfo = 'text',
        text = ~paste("", Company)) %>%
  layout(xaxis = list(range = xlim0,
                      zeroline = FALSE,
                      title = paste(Year - 1, "Ranking")),
         # The axis is reversed by giving the range in descending order.
         # autorange = "reversed" would make plotly compute the range itself
         # and ignore the explicit limits.
         yaxis = list(range = rev(ylim0),
                      zeroline = FALSE,
                      title = paste(Year, "Ranking"))
         )
library(ggplot2)

# Same chart built with ggplot2 and converted by ggplotly().  The `text` and
# `frame` aesthetics are not drawn by ggplot2 itself; they are supplied for
# ggplotly(), with `frame` set to Year.
gg <- ggplot(DataX, aes(RankLastyear, Rank, color = Sector, text = Company)) +
  geom_point(aes(size = Value, frame = Year)) +
  # Spell out `limits` rather than relying on partial matching of `lim`.
  scale_y_reverse(limits = c(100, -5))
ggplotly(gg)